Step 0: Import Packages¶

In [69]:
import cpuinfo
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver import ActionChains
import time
import pandas as pd

Step 1: 檢查處理器型號與Python¶

In [2]:
import cpuinfo
In [3]:
# Look up the real CPU model online and compare its true architecture with the
# architecture reported here. A mismatch means the Python interpreter in use is
# not a native build for this CPU, which makes Selenium very slow; switch to a
# Python build that matches the CPU.
cpu_i = cpuinfo.get_cpu_info()
reported_arch = cpu_i['arch_string_raw']
print(f"CPU虛擬架構:{reported_arch}")
reported_brand = cpu_i['brand_raw']
print(f"CPU真實型號:{reported_brand}")
CPU虛擬架構:arm64
CPU真實型號:Apple M1 Pro

Step 2: 爬蟲工具準備工作¶

方法一¶

1、確認Chrome版本
2、下載對應版本之ChromeDriver,注意處理器型號,網址 https://chromedriver.chromium.org/chromedriver-canary (此頁為Canary版;Chrome 115以上的正式版請至 https://googlechromelabs.github.io/chrome-for-testing/ 下載)

In [4]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
In [5]:
# # 方法一:自己檢查自己下載,Mac可能會被擋住
# driver_file = Service('../chromedriver_mac64/chromedriver')
# driver = webdriver.Chrome(service=driver_file)
In [6]:
# # 測試用
# from selenium import webdriver
# from selenium.webdriver.chrome.service import Service
# driver_file = Service('../chromedriver_mac64/chromedriver') # 你下載ChromeDriver的位置
# driver = webdriver.Chrome(service=driver_file)

方法二(推薦):使用webdriver-manager套件¶

1、pip install webdriver-manager

In [7]:
# !pip install webdriver-manager
In [8]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
In [9]:
# Smoke test: start Chrome with the driver binary that webdriver-manager
# installed, keep the window open briefly, then shut the session down.
driver_file = ChromeDriverManager().install()
options = webdriver.ChromeOptions()
options.add_argument('--incognito')  # incognito mode
driver = webdriver.Chrome(service=Service(driver_file), options=options)
try:
    time.sleep(1)  # keep the window up for a moment so the launch can be seen
finally:
    # quit() ends the whole WebDriver session (browser and driver process);
    # close() would only close the current window.
    driver.quit()
In [10]:
# # 測試用
# from selenium import webdriver
# from selenium.webdriver.chrome.service import Service
# from webdriver_manager.chrome import ChromeDriverManager
# driver_file = ChromeDriverManager().install()
# driver = webdriver.Chrome(service=Service(driver_file))

Step 3: 爬蟲--分析FB網頁架構¶

Step-1: 列出要抓取的資料
Step-2: 分析Chrome中網頁原始碼的架構
Step-3: 觀察Chrome中目標資料element的特徵
Step-4: 搜索element(搜索方式除非必要,一律使用CSS Selector)
Step-5: 從element中提取資料
Note: 爬蟲會不斷重複Step-2~Step-5的步驟

In [11]:
## 小功能function

# Scroll an element into the visible part of the page
def move_element_be_visible(driver, element, offset_y=-150):
    """Scroll ``element`` into view, then shift the window vertically.

    Args:
        driver: Object exposing ``execute_script(script)``; in this notebook
            it is the Selenium WebDriver.
        element: Object exposing ``location_once_scrolled_into_view``; in this
            notebook it is a Selenium WebElement.
        offset_y: Vertical pixel amount passed to ``window.scrollBy`` after the
            element has been scrolled into view. Negative values scroll back
            up. Defaults to -150, the value this notebook has always used.

    Returns:
        None.
    """
    # Reading this property is what triggers the scroll; the returned location
    # is not needed. An equivalent alternative is
    # driver.execute_script("arguments[0].scrollIntoView();", element).
    _ = element.location_once_scrolled_into_view
    driver.execute_script(f'window.scrollBy(0,{offset_y})')
# Move the mouse pointer onto an element
def mouse_move_to(driver, element):
    """Hover the mouse pointer over ``element`` using an ActionChains sequence.

    Args:
        driver: The WebDriver the action chain is bound to.
        element: The element to move the pointer to.
    """
    ActionChains(driver).move_to_element(element).perform()
In [12]:
# 一般模式登入
# driver_file = ChromeDriverManager().install()
# driver = webdriver.Chrome(service=Service(driver_file))
# url = "https://www.facebook.com/groups/foodspick/?locale=zh_TW&checkpoint_src=any"
# driver.get(url)
# 手動登入
In [13]:
# Open the target Facebook group page in an incognito Chrome session.
url = "https://www.facebook.com/groups/foodspick/?locale=zh_TW&checkpoint_src=any"

options = webdriver.ChromeOptions()
options.add_argument('--incognito')  # incognito mode
driver_file = ChromeDriverManager().install()
driver = webdriver.Chrome(options=options, service=Service(driver_file))
driver.get(url)
# Log in manually in the browser window that opens.
In [14]:
# All posts (feed items) that the page has loaded so far.
feed_locator = "div[class='x1yztbdb x1n2onr6 xh8yej3 x1ja2u2z']"
elements = driver.find_elements(By.CSS_SELECTOR, feed_locator)
elements
Out[14]:
[<selenium.webdriver.remote.webelement.WebElement (session="7a2b00a4e7436d8f2bc6aaf599ef2cf9", element="EB00B2B83E79BCFB1363D6C93824476D_element_417")>,
 <selenium.webdriver.remote.webelement.WebElement (session="7a2b00a4e7436d8f2bc6aaf599ef2cf9", element="EB00B2B83E79BCFB1363D6C93824476D_element_418")>,
 <selenium.webdriver.remote.webelement.WebElement (session="7a2b00a4e7436d8f2bc6aaf599ef2cf9", element="EB00B2B83E79BCFB1363D6C93824476D_element_419")>,
 <selenium.webdriver.remote.webelement.WebElement (session="7a2b00a4e7436d8f2bc6aaf599ef2cf9", element="EB00B2B83E79BCFB1363D6C93824476D_element_420")>,
 <selenium.webdriver.remote.webelement.WebElement (session="7a2b00a4e7436d8f2bc6aaf599ef2cf9", element="EB00B2B83E79BCFB1363D6C93824476D_element_421")>,
 <selenium.webdriver.remote.webelement.WebElement (session="7a2b00a4e7436d8f2bc6aaf599ef2cf9", element="EB00B2B83E79BCFB1363D6C93824476D_element_422")>,
 <selenium.webdriver.remote.webelement.WebElement (session="7a2b00a4e7436d8f2bc6aaf599ef2cf9", element="EB00B2B83E79BCFB1363D6C93824476D_element_423")>,
 <selenium.webdriver.remote.webelement.WebElement (session="7a2b00a4e7436d8f2bc6aaf599ef2cf9", element="EB00B2B83E79BCFB1363D6C93824476D_element_424")>,
 <selenium.webdriver.remote.webelement.WebElement (session="7a2b00a4e7436d8f2bc6aaf599ef2cf9", element="EB00B2B83E79BCFB1363D6C93824476D_element_425")>,
 <selenium.webdriver.remote.webelement.WebElement (session="7a2b00a4e7436d8f2bc6aaf599ef2cf9", element="EB00B2B83E79BCFB1363D6C93824476D_element_426")>,
 <selenium.webdriver.remote.webelement.WebElement (session="7a2b00a4e7436d8f2bc6aaf599ef2cf9", element="EB00B2B83E79BCFB1363D6C93824476D_element_427")>,
 <selenium.webdriver.remote.webelement.WebElement (session="7a2b00a4e7436d8f2bc6aaf599ef2cf9", element="EB00B2B83E79BCFB1363D6C93824476D_element_428")>,
 <selenium.webdriver.remote.webelement.WebElement (session="7a2b00a4e7436d8f2bc6aaf599ef2cf9", element="EB00B2B83E79BCFB1363D6C93824476D_element_429")>,
 <selenium.webdriver.remote.webelement.WebElement (session="7a2b00a4e7436d8f2bc6aaf599ef2cf9", element="EB00B2B83E79BCFB1363D6C93824476D_element_430")>,
 <selenium.webdriver.remote.webelement.WebElement (session="7a2b00a4e7436d8f2bc6aaf599ef2cf9", element="EB00B2B83E79BCFB1363D6C93824476D_element_431")>,
 <selenium.webdriver.remote.webelement.WebElement (session="7a2b00a4e7436d8f2bc6aaf599ef2cf9", element="EB00B2B83E79BCFB1363D6C93824476D_element_432")>,
 <selenium.webdriver.remote.webelement.WebElement (session="7a2b00a4e7436d8f2bc6aaf599ef2cf9", element="EB00B2B83E79BCFB1363D6C93824476D_element_433")>,
 <selenium.webdriver.remote.webelement.WebElement (session="7a2b00a4e7436d8f2bc6aaf599ef2cf9", element="EB00B2B83E79BCFB1363D6C93824476D_element_434")>,
 <selenium.webdriver.remote.webelement.WebElement (session="7a2b00a4e7436d8f2bc6aaf599ef2cf9", element="EB00B2B83E79BCFB1363D6C93824476D_element_435")>,
 <selenium.webdriver.remote.webelement.WebElement (session="7a2b00a4e7436d8f2bc6aaf599ef2cf9", element="EB00B2B83E79BCFB1363D6C93824476D_element_436")>,
 <selenium.webdriver.remote.webelement.WebElement (session="7a2b00a4e7436d8f2bc6aaf599ef2cf9", element="EB00B2B83E79BCFB1363D6C93824476D_element_437")>,
 <selenium.webdriver.remote.webelement.WebElement (session="7a2b00a4e7436d8f2bc6aaf599ef2cf9", element="EB00B2B83E79BCFB1363D6C93824476D_element_438")>,
 <selenium.webdriver.remote.webelement.WebElement (session="7a2b00a4e7436d8f2bc6aaf599ef2cf9", element="EB00B2B83E79BCFB1363D6C93824476D_element_439")>,
 <selenium.webdriver.remote.webelement.WebElement (session="7a2b00a4e7436d8f2bc6aaf599ef2cf9", element="EB00B2B83E79BCFB1363D6C93824476D_element_440")>,
 <selenium.webdriver.remote.webelement.WebElement (session="7a2b00a4e7436d8f2bc6aaf599ef2cf9", element="EB00B2B83E79BCFB1363D6C93824476D_element_441")>,
 <selenium.webdriver.remote.webelement.WebElement (session="7a2b00a4e7436d8f2bc6aaf599ef2cf9", element="EB00B2B83E79BCFB1363D6C93824476D_element_442")>,
 <selenium.webdriver.remote.webelement.WebElement (session="7a2b00a4e7436d8f2bc6aaf599ef2cf9", element="EB00B2B83E79BCFB1363D6C93824476D_element_443")>,
 <selenium.webdriver.remote.webelement.WebElement (session="7a2b00a4e7436d8f2bc6aaf599ef2cf9", element="EB00B2B83E79BCFB1363D6C93824476D_element_444")>,
 <selenium.webdriver.remote.webelement.WebElement (session="7a2b00a4e7436d8f2bc6aaf599ef2cf9", element="EB00B2B83E79BCFB1363D6C93824476D_element_445")>,
 <selenium.webdriver.remote.webelement.WebElement (session="7a2b00a4e7436d8f2bc6aaf599ef2cf9", element="EB00B2B83E79BCFB1363D6C93824476D_element_446")>,
 <selenium.webdriver.remote.webelement.WebElement (session="7a2b00a4e7436d8f2bc6aaf599ef2cf9", element="EB00B2B83E79BCFB1363D6C93824476D_element_447")>,
 <selenium.webdriver.remote.webelement.WebElement (session="7a2b00a4e7436d8f2bc6aaf599ef2cf9", element="EB00B2B83E79BCFB1363D6C93824476D_element_149")>,
 <selenium.webdriver.remote.webelement.WebElement (session="7a2b00a4e7436d8f2bc6aaf599ef2cf9", element="EB00B2B83E79BCFB1363D6C93824476D_element_154")>,
 <selenium.webdriver.remote.webelement.WebElement (session="7a2b00a4e7436d8f2bc6aaf599ef2cf9", element="EB00B2B83E79BCFB1363D6C93824476D_element_448")>,
 <selenium.webdriver.remote.webelement.WebElement (session="7a2b00a4e7436d8f2bc6aaf599ef2cf9", element="EB00B2B83E79BCFB1363D6C93824476D_element_449")>,
 <selenium.webdriver.remote.webelement.WebElement (session="7a2b00a4e7436d8f2bc6aaf599ef2cf9", element="EB00B2B83E79BCFB1363D6C93824476D_element_450")>,
 <selenium.webdriver.remote.webelement.WebElement (session="7a2b00a4e7436d8f2bc6aaf599ef2cf9", element="EB00B2B83E79BCFB1363D6C93824476D_element_451")>]
In [19]:
# Choose the post to scrape by its position in the list of loaded posts,
# then scroll that post into the visible area.
feed_index = 32
feed = elements[feed_index]
move_element_be_visible(driver, feed)

貼文內容¶

In [20]:
# Post author: text of the first matching span inside the post's <h2>.
locator = "h2>span[class='xt0psk2']"
author_spans = feed.find_elements(By.CSS_SELECTOR, locator)
feed_man = author_spans[0].text
print(feed_man)
傅鏡暉
In [21]:
# Post time: hover over the post's timestamp link, then read the text of the
# first tooltip found on the page.
locator = "div[class='xu06os2 x1ok221b'] > span[dir='auto'] > span > span> span > a[role='link'][tabindex='0']"
timestamp_links = feed.find_elements(By.CSS_SELECTOR, locator)
element = timestamp_links[0]
mouse_move_to(driver, element)
time.sleep(1)  # give the page time to react to the hover
tooltips = driver.find_elements(By.CSS_SELECTOR, "div[role='tooltip']")
feed_time = tooltips[0].text
print(feed_time)
2024年5月3日 星期五下午9:47
In [22]:
# Post content -- text
locator = "div[dir='auto'] > div[class='x1iorvi4 x1pi30zi x1l90r2v x1swvt13'] > span[dir='auto']"
content_spans = feed.find_elements(By.CSS_SELECTOR, locator)
feed_content = content_spans[0].text
print(feed_content)
高CP值吃到飽自助餐,用餐空間高雅舒適,生魚片及海鮮新鮮、有蒸生蠔、牛排、干貝蟹肉煲、炸蝦、鴨肝醬等百種美食及哈根達斯冰淇淋。現正舉辦甜點季,自即日起到5/17,請到日本米其林甜點主廚製作10款巧克力甜點。
In [23]:
# Post content -- link (href of the first matching anchor inside the post)
locator = "div[class='xmjcpbm x1n2onr6'] > div > a[rel='nofollow noreferrer'][role='link'][tabindex='0']"
link_anchors = feed.find_elements(By.CSS_SELECTOR, locator)
element = link_anchors[0]
feed_link = element.get_attribute('href')
feed_link
Out[23]:
'https://lordcat.tw/archives/141343'
In [24]:
# Post content -- link text (caption shown for the link)
locator = "span[dir='auto'] > span > span[dir='auto']"
caption_spans = feed.find_elements(By.CSS_SELECTOR, locator)
element = caption_spans[0]
feed_link_alt = element.text
feed_link_alt
Out[24]:
'高CP值吃到飽自助餐,生魚片、蒸生蠔、牛排、干貝蟹肉煲等百種美食及哈根達斯'
In [25]:
# Post content -- picture (src of the first matching <img> inside the post)
locator = "div[class='x10l6tqk x13vifvy'] > img"
picture_images = feed.find_elements(By.CSS_SELECTOR, locator)
feed_pic = picture_images[0].get_attribute('src')
feed_pic
Out[25]:
'https://external.ftpe8-3.fna.fbcdn.net/emg1/v/t13/13706082881180517361?url=https%3A%2F%2Flordcat.tw%2Fwp-content%2Fuploads%2F2024%2F04%2F1714218163-a6f0166bd53bdcb983763ed0318a1f96.jpg&fb_obo=1&utld=lordcat.tw&stp=c0.5000x0.5000f_dst-jpg_flffffff_p1000x522_q75&ccb=13-1&oh=06_Q39951lI8udLAh0424JzpsIefLv6N5LrSkU_Q4cAoPFrgro&oe=663BF857&_nc_sid=085657'
In [28]:
# Like count (kept as the text shown on the page)
locator = "span[class='x1e558r4']"
like_spans = feed.find_elements(By.CSS_SELECTOR, locator)
element = like_spans[0]
emotion = element.text
emotion
Out[28]:
'33'
In [44]:
# Comment count: first element matched by the counter selector.
locator = "div > span >div[role='button'][tabindex='0'] > div > div > span[dir='auto']"
counter_spans = feed.find_elements(By.CSS_SELECTOR, locator)
element = counter_spans[0]
comment_number = element.text
comment_number
Out[44]:
'9'
In [45]:
# Share count: second element matched by the same counter selector that the
# comment count uses (index 0 is the comment count, index 1 the share count).
# Stored under its own name so it no longer overwrites `comment_number`.
element = feed.find_elements(By.CSS_SELECTOR, "div > span >div[role='button'][tabindex='0'] > div > div > span[dir='auto']")[1]
share_number = element.text
share_number
Out[45]:
'2'

回覆內容抓取¶

In [63]:
# Commenter: text of the first matching span inside the post.
locator = "span[class='x3nfvp2']"
commenter_spans = feed.find_elements(By.CSS_SELECTOR, locator)
element = commenter_spans[0]
comment_man = element.text
comment_man
Out[63]:
'吳莉莉'
In [64]:
# Comment time: hover over the first matching element inside the post, then
# read the text of the first tooltip found on the page.
locator = "div[class='x6s0dn4 x3nfvp2']"
hover_targets = feed.find_elements(By.CSS_SELECTOR, locator)
element = hover_targets[0]
mouse_move_to(driver, element)
time.sleep(1)  # give the page time to react to the hover
tooltips = driver.find_elements(By.CSS_SELECTOR, "div[role='tooltip']")
comment_time = tooltips[0].text
print(comment_time)
2024年5月3日 星期五下午11:08
In [65]:
# Comment text: first matching element inside the post.
locator = "div[class='x1lliihq xjkvuk6 x1iorvi4']"
comment_bodies = feed.find_elements(By.CSS_SELECTOR, locator)
element = comment_bodies[0]
comment_content = element.text
comment_content
Out[65]:
'母親節可以來這裡了'
In [75]:
# Gather the scraped values into a one-row table and export it as CSV.
# Each key becomes a column; the insertion order fixes the column order.
feed_record = {
    "Feed Number": 1,
    "Feed Man": feed_man,
    "Feed Time": feed_time,
    "Feed Content": feed_content,
    "Feed Content Picture": feed_pic,
    "Feed Link String": feed_link_alt,
    "Feed Link": feed_link,
    "Comment Man": comment_man,
    "Comment Time": comment_time,
    "Comment Content": comment_content,
}
df_feeds = pd.DataFrame([feed_record])
# "utf-8-sig" prefixes the file with a BOM -- presumably so spreadsheet tools
# detect the encoding of the Chinese text.
df_feeds.to_csv("../feeds.csv", encoding="utf-8-sig", index=False)
# to_csv() returns None when given a path, so end on the frame itself to have
# the cell display the exported row.
df_feeds
Out[75]:
Feed Number Feed Man Feed Time Feed Content Feed Content Picture Feed Link String Feed Link Comment Man Comment Time Comment Content
0 1 傅鏡暉 2024年5月3日 星期五下午11:08 高CP值吃到飽自助餐,用餐空間高雅舒適,生魚片及海鮮新鮮、有蒸生蠔、牛排、干貝蟹肉煲、炸蝦、... https://external.ftpe8-3.fna.fbcdn.net/emg1/v/... 高CP值吃到飽自助餐,生魚片、蒸生蠔、牛排、干貝蟹肉煲等百種美食及哈根達斯 https://lordcat.tw/archives/141343 吳莉莉 2024年5月3日 星期五下午11:08 母親節可以來這裡了
In [1]:
# End the WebDriver session: quit() closes every window and stops the driver
# process, whereas close() only closes the current window.
# Requires the `driver` created earlier in this notebook (same kernel session).
driver.quit()
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[1], line 1
----> 1 driver.close()

NameError: name 'driver' is not defined
In [ ]: